In [1]:
import pandas as pd, sklearn as sl, numpy as np, matplotlib.pyplot as plt, seaborn as sns
import pylev
import re
import warnings
warnings.filterwarnings("ignore")
In [2]:
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from umap.umap_ import UMAP
from random import randint
from datetime import datetime

from sklearn import datasets
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error, mean_absolute_error
from pandas_profiling import ProfileReport


%matplotlib inline
In [3]:
# Se carga el dataFrame
RentalBikesDF = pd.read_csv(r"C:\Users\LGARCIA\OneDrive - Valorem\LGARCIA\Documents\04. Estudio\05. Maestria\Ciencia Aplicada de Datos\Taller No. 2\Data\Data_Taller_2.csv",delimiter=';')
In [4]:
# LIMPIEZA Y PREPARACION DE DATOS (25 pts)
# Búsqueda y corrección de valores atípicos, valores faltantes y duplicados. Debido a que la
# base de datos no es muy grande, deberá abstenerse de eliminar registros. Así mismo,
# busque la oportunidad de generar una nueva variable con base en la información
# suministrada.
In [5]:
# Esta limpieza la vamos a dividir en dos. Primero: Limpieza - revisar columna por columna los valores y tipos de datos, 
# buscando datos atipicos, fechas que no sean correctas, datos Nan para reemplazarlos por datos que tengan un poco más de 
# sentido, conversión de tipos de datos, etc. Segunda parte: Preparación de Datos asociados al clima. Crearemos una columnas 
# booleanas que nos indiquen, por ejemplo, si ese registro llovio o nevó.
In [6]:
# Los primeros comandos es para entender que datos tenemos, cantidad de registros y cantidad de nulos.
RentalBikesDF.shape
Out[6]:
(731, 12)
In [7]:
RentalBikesDF.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 731 entries, 0 to 730
Data columns (total 12 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   instant     731 non-null    int64 
 1   Date        731 non-null    object
 2   Season      731 non-null    object
 3   Holiday     731 non-null    object
 4   Weather     731 non-null    object
 5   Temp        727 non-null    object
 6   Feel_Temp   731 non-null    object
 7   Hum         731 non-null    object
 8   Wind        731 non-null    object
 9   Casual      731 non-null    int64 
 10  Registered  731 non-null    int64 
 11  cnt         731 non-null    int64 
dtypes: int64(4), object(8)
memory usage: 68.7+ KB
In [8]:
RentalBikesDF
Out[8]:
instant Date Season Holiday Weather Temp Feel_Temp Hum Wind Casual Registered cnt
0 1 1/01/2020 Springer No Mist + Cloudy 8,175849 9,090375 0,805833 10,749882 331 654 985
1 2 2/01/2020 Springer No Mist + Cloudy 9,083466 8,625733 0,696087 -16,652113 131 670 801
2 3 3/01/2020 Springer No Few clouds 1,229108 0,902035 0,437273 16,636703 120 1229 1349
3 4 4/01/2020 Springer No Few clouds 1,4 1,969734 0,590435 10,739832 108 1454 1562
4 5 5/01/2020 Springer No Partly cloudy 2,666979 2,77569 0,436957 12,5223 82 1518 1600
... ... ... ... ... ... ... ... ... ... ... ... ...
726 727 27/12/2021 Springer No Mist + Cloudy 3,945849 2,652174 0,652917 23,458911 247 1867 2114
727 728 28/12/2021 Springer No Mist + Few clouds 3,906651 3,987162 0,59 10,416557 644 2451 3095
728 729 29/12/2021 Springer No Mist 3,906651 3,3928 0,752917 8,333661 159 1182 1341
729 730 30/12/2021 Springer No Few clouds 4,024151 2,8899 0,483333 23,500518 364 1432 1796
730 731 31/12/2021 Springer No Mist + Few clouds 2,144151 2,503889 0,5775 10,374682 439 2290 2729

731 rows × 12 columns

In [9]:
# Creating a lambda expression for datetime parsing * Esta función la adapté del bono
dateparse = lambda x: datetime.strptime(x, "%d/%m/%Y")
In [10]:
# Creating a function for validating which value is causing the previous error * Esta función la adapté del bono
def error_in_format(x):
    try:
        datetime.strptime(x, "%d/%m/%Y")
        return False
    except:
        return True
In [11]:
RentalBikesDF.loc[RentalBikesDF["Date"].apply(error_in_format)]
Out[11]:
instant Date Season Holiday Weather Temp Feel_Temp Hum Wind Casual Registered cnt
424 425 29/02/2021 Springer No Mist 8,184356 8,37809 0,804783 12,000839 65 1769 1834
In [12]:
# 29 de Febrero no existe en el calendario, así que la reemplacé por 1 de Marzo. 

RentalBikesDF['Date'].replace('29/02/2021','01/03/2021', inplace=True)
RentalBikesDF["Date"] = RentalBikesDF["Date"].apply(dateparse)
In [13]:
# No se evidencian datos correspondientes fuera de un rango logico.

RentalBikesDF["Date"].min(),RentalBikesDF["Date"].max()
Out[13]:
(Timestamp('2020-01-01 00:00:00'), Timestamp('2021-12-31 00:00:00'))
In [14]:
RentalBikesDF["Day"]=RentalBikesDF["Date"].dt.dayofweek
In [15]:
RentalBikesDF["Day"].unique()
Out[15]:
array([2, 3, 4, 5, 6, 0, 1], dtype=int64)
In [16]:
# no se evidencian datos nulos o nan

RentalBikesDF['Season'].unique()
RentalBikesDF['Season'].replace('Springer','Spring', inplace=True)
In [17]:
# no se evidencian datos nulos o nan

RentalBikesDF['Holiday'].unique()
Out[17]:
array(['No', 'Yes'], dtype=object)
In [18]:
# Se evidencian problemas en la recolección de datos. Se crea un set de datos y mediante la funcion levenshtein se normalizan 
# los datos.

RentalBikesDF['Weather'].unique()
Out[18]:
array(['Mist + Cloudy', 'Few clouds', ' Partly    cloudy', 'Clear',
       'Mist', 'Mist + Broken clouds', ' Partly cloudy',
       'Mist + Few clouds', 'Rain + Thunderstorm',
       'Light Rain + Scattered clouds', 'Snow'], dtype=object)
In [19]:
RentalBikesDF['Weather']=RentalBikesDF['Weather'].str.strip()
RentalBikesDF['Weather']=RentalBikesDF['Weather'].str.upper()
In [20]:
Weather_list = ['MIST + CLOUDY', 'FEW CLOUDS', 'PARTLY CLOUDY', 'MIST', 'MIST + BROKEN CLOUDS','MIST + FEW CLOUDS','RAIN + THUNDERSTORM','LIGHT RAIN + SCATTERED CLOUDS','SNOW','CLEAR']
In [21]:
RentalBikesDF.loc[~RentalBikesDF['Weather'].isin(Weather_list), "Weather"].unique()
Out[21]:
array(['PARTLY    CLOUDY'], dtype=object)
In [22]:
def corregirWeather(x):
    distancia = 1
    if pylev.levenshtein('MIST + CLOUDY', x) <= distancia: 
        return 'MIST + CLOUDY'
    elif pylev.levenshtein('FEW CLOUDS', x) <= distancia:
        return 'FEW CLOUDS'
    elif pylev.levenshtein('PARTLY CLOUDY', x) <= distancia:
        return 'PARTLY CLOUDY'
    elif pylev.levenshtein('MIST', x) <= distancia:
        return 'MIST'
    elif pylev.levenshtein('MIST + BROKEN CLOUDS', x) <= distancia:
        return 'MIST + BROKEN CLOUDS'
    elif pylev.levenshtein('MIST + FEW CLOUDS', x) <= distancia:
        return 'MIST + FEW CLOUDS'
    elif pylev.levenshtein('RAIN + THUNDERSTORM', x) <= distancia:
        return 'RAIN + THUNDERSTORM'
    elif pylev.levenshtein('SNOW', x) <= distancia:
        return 'SNOW'
    elif pylev.levenshtein('CLEAR', x) <= distancia:
        return 'CLEAR'
    else:
        distancia = distancia + 1
        if pylev.levenshtein('MIST + CLOUDY', x) <= distancia: 
            return 'MIST + CLOUDY'
        elif pylev.levenshtein('FEW CLOUDS', x) <= distancia:
            return 'FEW CLOUDS'
        elif pylev.levenshtein('PARTLY CLOUDY', x) <= distancia:
            return 'PARTLY CLOUDY'
        elif pylev.levenshtein('MIST', x) <= distancia:
            return 'MIST'
        elif pylev.levenshtein('MIST + BROKEN CLOUDS', x) <= distancia:
            return 'MIST + BROKEN CLOUDS'
        elif pylev.levenshtein('MIST + FEW CLOUDS', x) <= distancia:
            return 'MIST + FEW CLOUDS'
        elif pylev.levenshtein('RAIN + THUNDERSTORM', x) <= distancia:
            return 'RAIN + THUNDERSTORM'
        elif pylev.levenshtein('SNOW', x) <= distancia:
            return 'SNOW'
        elif pylev.levenshtein('CLEAR', x) <= distancia:
            return 'CLEAR'
        else:
            distancia = distancia + 1
            if pylev.levenshtein('MIST + CLOUDY', x) <= distancia: 
                return 'MIST + CLOUDY'
            elif pylev.levenshtein('FEW CLOUDS', x) <= distancia:
                return 'FEW CLOUDS'
            elif pylev.levenshtein('PARTLY CLOUDY', x) <= distancia:
                return 'PARTLY CLOUDY'
            elif pylev.levenshtein('MIST', x) <= distancia:
                return 'MIST'
            elif pylev.levenshtein('MIST + BROKEN CLOUDS', x) <= distancia:
                return 'MIST + BROKEN CLOUDS'
            elif pylev.levenshtein('MIST + FEW CLOUDS', x) <= distancia:
                return 'MIST + FEW CLOUDS'
            elif pylev.levenshtein('RAIN + THUNDERSTORM', x) <= distancia:
                return 'RAIN + THUNDERSTORM'
            elif pylev.levenshtein('SNOW', x) <= distancia:
                return 'SNOW'
            elif pylev.levenshtein('CLEAR', x) <= distancia:
                return 'CLEAR'
            else:
                return x
In [23]:
RentalBikesDF['Weather']=RentalBikesDF['Weather'].apply(corregirWeather)
In [24]:
RentalBikesDF.loc[~RentalBikesDF['Weather'].isin(Weather_list), "Weather"].unique()
Out[24]:
array([], dtype=object)
In [25]:
RentalBikesDF['Temp'].unique()
Out[25]:
array(['8,175849', '9,083466', '1,229108', '1,4', '2,666979', '1,604356',
       '1,236534', '-0,245', '-1,498349', '-0,910849', '-0,052723',
       '0,118169', '-0,43911', '2,966651', '2,888349', '0,264151',
       '2,183349', '5,732178', '4,298349', '0,3425', '-5,2208712',
       '-3,4634801', '-3,4226089', '2,503466', '2,2225', '1,165',
       '1,563466', '2,176534', '0,499151', '1,032178', '4,22', '0,786979',
       '1,931288', '5,434151', '4,768349', '2,379151', '-1,665199',
       '-1,215644', '0,887277', '2,4575', '6,876534', '11,505',
       '4,506089', '6,958267', '12,484151', '16,518349', '10,760849',
       '5,405199', '6,256651', '0,564434', '2,421733', '5,895644',
       '9,124356', '5,2775', '8,143466', '11,141831', '4,533349', '7,745',
       '1,321651', '10,055849', '9,696534', '4,301733', '5,7475',
       '5,904151', '10,287277', '7,470849', '10,064356', '7,285199',
       '6,917377', '9,165199', '17,38', '14,2075', '7,6275', '12,230445',
       '12,758349', '8,306979', '5,395', '4,415849', '4,494151',
       '3,893021', '4,424356', '6,2175', '6,1', '4,611651', '6,805',
       '9,781651', '18,946651', '11,465849', '10,369151', '12,5625',
       '7,784151', '8,0975', nan, '19,995644', '15,6175', '11,3875',
       '13,9725', '12,993349', '12,249151', '13,463349', '16,0875',
       '15,774151', '19,965', '13,580849', '7,823349', '13,62',
       '19,338349', '20,513349', '21,688349', '21,14', '21,0225', '15,97',
       '13,228349', '17,810849', '20,983349', '14,520849', '16,44',
       '16,831651', '17,0275', '17,4975', '17,145', '16,479151',
       '18,4375', '19,1425', '18,398349', '17,85', '16,949151',
       '17,223349', '20,3175', '20,395849', '23,02', '23,059151',
       '25,291651', '24,038349', '22,824151', '23,3725', '26,466651',
       '28,425', '27,915849', '25,605', '21,845', '22,471651',
       '23,881651', '25,2525', '28,464151', '29,991651', '27,485',
       '26,075', '24,5475', '21,453349', '21,531651', '22,510849',
       '24,743349', '24,860849', '23,999151', '26,231651', '26,035849',
       '24,665', '23,96', '24,0775', '26,975849', '25,9575', '26,701651',
       '25,683349', '26,153349', '27,093349', '25,84', '27,25',
       '25,330849', '27,1325', '27,8375', '29,325849', '23,176651',
       '24,273349', '25,800849', '28,503349', '28,111651', '30,305',
       '31,871651', '31,910849', '31,01', '26,936651', '28,268349',
       '28,620849', '31,401651', '29,795849', '29,874151', '28,816651',
       '26,388349', '25,37', '25,409151', '26,8975', '27,955',
       '28,033349', '25,7225', '24,234151', '23,803349', '23,294151',
       '24,939151', '25,996651', '24,195', '24,7825', '24,508349',
       '22,119151', '23,646651', '24,155849', '24,9', '25,231773',
       '21,923349', '22,040849', '22,863349', '22,785', '22,236651',
       '23,450849', '20,160849', '21,793911', '22,55', '22,706651',
       '22,284356', '22,589151', '14,050849', '15,108349', '15,8525',
       '20,630849', '21,805849', '18,515849', '11,27', '8,763349',
       '14,755849', '17,301651', '15,225849', '16,009151', '17,419151',
       '18,829151', '18,633349', '17,536651', '19,690849', '17,889151',
       '15,813349', '16,048349', '17,105849', '17,461733', '14,364151',
       '12,0925', '11,8575', '11,818349', '13,776651', '14,168349',
       '14,09', '7,549151', '3,945849', '7,000849', '7,98', '10,839151',
       '9,7425', '11,191651', '10,956651', '7,353349', '8,371651',
       '10,565', '10,8', '9,86', '7,235849', '12,719151', '16,91',
       '8,058349', '4,885849', '13,0325', '11,583349', '9,546651',
       '9,625', '9,664151', '15,663466', '13,541651', '7,275', '6,6875',
       '6,765849', '6,060849', '10,134151', '13,7375', '5,669151',
       '4,925', '3,201651', '6,9225', '4,141651', '5,003349', '12,131651',
       '11,896651', '4,914801', '7,121733', '6,05911', '3,671651',
       '6,648349', '9,39', '4,833021', '-0,95', '-2,9475', '7,705849',
       '10,486651', '7,8625', '2,535849', '6,508712', '9,9775', '0,46',
       '-0,166651', '0,93', '9,533021', '0,146651', '-0,3625', '2,261651',
       '5,825849', '11,975', '6,844151', '4,650849', '10,33', '6,726651',
       '5,282623', '8,645849', '4,063349', '4,455', '5,199151', '-2,0075',
       '6,883349', '8,136651', '8,293349', '5,16', '5,527822',
       '10,604151', '13,345849', '11,1525', '5,120849', '9,233349',
       '8,880849', '8,184356', '14,834151', '8,606651', '7,314151',
       '3,436651', '10,995849', '16,7925', '11,309151', '5,5125',
       '9,001733', '13,933349', '18,555', '18,9075', '18,2025',
       '16,165849', '17,615', '18,359151', '16,988349', '18,045849',
       '20,278349', '12,954151', '7,196651', '11,935849', '12,014151',
       '12,393911', '17,458349', '12,445', '15,5', '14,990849',
       '8,388712', '10,6825', '12,7975', '15,265', '23,215849',
       '20,591651', '15,421651', '16,753349', '18,79', '10,643349',
       '7,118349', '11,426651', '14,403349', '13,5025', '9,703349',
       '13,815849', '20,826651', '18,32', '21,4925', '21,218349',
       '17,2625', '19,025', '17,066651', '20,7875', '20,748349',
       '19,886651', '20,2', '21,179151', '20,121651', '20,905', '24,43',
       '25,4875', '22,745849', '19,416651', '20,0825', '25,879151',
       '19,8475', '18,711651', '24,351651', '28,7775', '28,5425',
       '25,644151', '21,649151', '27,210849', '31,205849', '30,344151',
       '28,738349', '28,699151', '29,090849', '30,8925', '30,931651',
       '32,498349', '30,6575', '25,056651', '27,054151', '30,461651',
       '29,286651', '28,19', '20,004151', '26,858349', '27,289151',
       '27,524151', '25,918349', '26,349151', '25,526651', '27,3675',
       '27,994151', '28,150849', '26,584151', '25,213349', '21,884151',
       '24,704151', '25,0175', '23,098349', '27,406651', '26,114151',
       '26,623349', '22,980849', '20,67', '21,766651', '19,26',
       '19,299151', '21,296651', '17,9675', '17,693349', '16,870849',
       '21,100849', '19,769151', '22,9025', '11,544151', '10,016651',
       '14,011651', '13,424151', '16,5575', '18,476651', '14,9125',
       '17,575849', '19,6125', '17,654151', '14,4425', '12,68',
       '6,954554', '8,8025', '9,194151', '8,685', '8,552178', '8,998349',
       '10,290849', '11,779151', '14,795', '5,590849', '8,215',
       '9,899151', '9,585849', '9,311651', '5,081651', '3,554151',
       '5,708349', '5,943349', '5,20089', '6,021651', '8,3325', '13,2675',
       '12,601651', '4,024151', '7,079151', '9,938349', '5,9825',
       '5,238349', '9,0375', '7,51', '2,871288', '5,691288', '3,906651',
       '2,144151'], dtype=object)
In [26]:
# Se evidencian datos nan y problemas con el tipo de dato y la coma. Para el dato nan vamos a insertarle un valor que se 
# evidencie facilmente como outlayer para despues poderlo transformar. La idea es al final asignarle la misma sensación 
# termica. Por último cambiamos las comas por puntos y cambiamos el tipo de dato.

RentalBikesDF['Temp']=RentalBikesDF['Temp'].replace(np.nan, '5000,0')
In [27]:
RentalBikesDF['Temp']=RentalBikesDF['Temp'].replace(',', '.',regex=True)
RentalBikesDF['Temp']=RentalBikesDF['Temp'].astype(float, errors = 'raise')
In [28]:
# Se evidencia el outlayer

RentalBikesDF['Temp'].min(),RentalBikesDF['Temp'].max()
Out[28]:
(-5.2208712, 5000.0)
In [29]:
RentalBikesDF['Feel_Temp'].unique()
Out[29]:
array(['9,090375', '8,625733', '0,902035', '1,969734', '2,77569',
       '2,960823', '1,815433', '-0,374062', '-2,539775', '-0,908264',
       '0,998808', '-0,457769', '-0,908499', '0,855411', '3,661264',
       '3,008199', '0,308237', '2,919651', '6,025834', '3,98735',
       '-0,581849', '-4,2837288', '-3,3545623', '-2,45729', '3,022722',
       '1,5692', '2,3259', '2,495899', '1,969922', '3,765134', '0,75375',
       '3,02291', '3,957599', '0,360266', '2,743589', '3,423726',
       '5,708537', '6,271926', '1,317562', '-1,218699', '-0,971244',
       '2,034923', '2,948838', '7,233311', '10,72245', '3,950878',
       '6,8614', '12,146926', '16,063201', '10,395988', '5,03451',
       '5,351525', '0,743551', '3,548699', '5,591977', '8,471667',
       '5,263024', '8,502123', '10,805546', '4,402313', '7,043337',
       '1,406251', '4,016913', '9,802613', '9,213844', '3,207667',
       '6,2128', '5,470576', '10,126396', '6,335', '7,31025', '9,864277',
       '7,604', '6,954366', '9,24571', '11,285651', '16,769423',
       '13,926675', '11,257545', '12,710174', '7,883133', '4,729151',
       '4,046664', '4,105837', '3,765933', '4,105978', '5,766676',
       '5,97545', '4,106025', '5,322338', '6,834939', '9,802049',
       '17,517663', '10,217576', '12,383712', '7,250513', '8,051863',
       '12,056639', '18,565199', '15,173538', '11,612301', '13,748874',
       '12,769911', '11,998124', '12,947712', '15,647862', '14,995126',
       '18,526424', '13,332924', '7,131838', '13,155687', '17,932861',
       '19,0015', '19,921901', '19,031674', '19,209663', '15,380761',
       '13,808987', '13,065588', '17,043151', '19,357713', '11,01855',
       '12,770099', '14,283499', '16,093187', '16,389851', '16,686562',
       '16,567887', '16,8348', '16,598061', '15,2321', '15,529563',
       '17,192', '17,874064', '17,310863', '16,776426', '16,004874',
       '16,864974', '18,882825', '19,743912', '20,426211', '20,930474',
       '22,770336', '21,939376', '20,781813', '20,93085', '23,541324',
       '26,093001', '25,885449', '22,265274', '19,595251', '19,950712',
       '20,989788', '21,227326', '22,82965', '26,182113', '27,606213',
       '25,054724', '23,867786', '22,236275', '20,277738', '19,802662',
       '19,624438', '19,981262', '20,218001', '22,265838', '22,354762',
       '21,969362', '24,609963', '24,610151', '22,859401', '22,235711',
       '21,968563', '21,939188', '24,550226', '22,651614', '23,363476',
       '23,423025', '23,274599', '24,727886', '24,224751', '24,282937',
       '23,512701', '23,215426', '24,431175', '26,300788', '26,745925',
       '24,401988', '21,849888', '21,345437', '21,998361', '23,482151',
       '25,084475', '27,131513', '27,10195', '30,839437', '31,522112',
       '29,801489', '29,356963', '25,885026', '24,758013', '24,461349',
       '26,7753', '28,940449', '26,241239', '26,300412', '25,232337',
       '23,957039', '23,245036', '22,858649', '23,779238', '25,618724',
       '25,054301', '26,033687', '24,194201', '22,621487', '22,739974',
       '22,355326', '21,346236', '20,959849', '22,354339', '23,333537',
       '23,126126', '21,761387', '22,502812', '23,749675', '21,997938',
       '20,485149', '21,642524', '22,355138', '23,007451', '21,871132',
       '22,454073', '20,574026', '19,951088', '20,722687', '20,901287',
       '20,425976', '21,761011', '23,275163', '21,405362', '16,2144',
       '17,578763', '18,101967', '19,210462', '20,574214', '20,633763',
       '20,30011', '20,367038', '21,4643', '18,022537', '13,689325',
       '14,490064', '15,055239', '16,894725', '17,014199', '17,875051',
       '18,083261', '16,539875', '18,527364', '18,913939', '19,684974',
       '19,002675', '19,032426', '19,002064', '17,606963', '11,404561',
       '8,229899', '10,426162', '14,224326', '16,775486', '14,579975',
       '15,706988', '16,122374', '16,627201', '17,517475', '17,666512',
       '16,332699', '17,934788', '15,440075', '15,648238', '16,004075',
       '16,150856', '11,909012', '11,849651', '13,482149', '13,778625',
       '14,223762', '13,481162', '6,984164', '2,711911', '7,102463',
       '8,734961', '10,663136', '10,336251', '11,078287', '10,959424',
       '7,221138', '9,030826', '10,840937', '11,375562', '11,226713',
       '9,564887', '6,420399', '8,823274', '12,23585', '16,656764',
       '15,856213', '13,243436', '7,191387', '4,817887', '7,250701',
       '13,481726', '12,917914', '11,819712', '12,235239', '9,506137',
       '9,891537', '10,099089', '13,4226', '15,035734', '13,214625',
       '6,627387', '6,361038', '7,577351', '6,598388', '8,4077',
       '10,514475', '13,4508', '10,811562', '4,076086', '6,924474',
       '4,521364', '3,898238', '4,699212', '6,153486', '7,903014',
       '11,375139', '8,911775', '3,720437', '3,542213', '5,203851',
       '10,633338', '12,116799', '12,027687', '9,743111', '6,064374',
       '5,158167', '6,830145', '7,398751', '5,158778', '4,402924',
       '11,463687', '9,654187', '3,858288', '-2,065075', '-2,391161',
       '5,085364', '7,992549', '10,366613', '7,992126', '3,631513',
       '6,984822', '5,292587', '9,951086', '3,720014', '0,605089',
       '-0,403625', '0,961161', '9,121066', '4,936938', '0,931786',
       '2,385026', '0,219125', '-0,37425', '8,408076', '5,856587',
       '8,73435', '11,523001', '7,339813', '4,343375', '9,921899',
       '13,927286', '10,751637', '6,539262', '4,818075', '4,432487',
       '5,932022', '8,971888', '4,521787', '4,313436', '5,797226',
       '1,910749', '-3,222074', '7,695462', '8,526563', '7,517614',
       '8,704975', '4,492036', '4,849377', '5,870311', '10,455349',
       '12,888962', '11,315637', '4,016725', '4,610476', '8,823838',
       '8,615675', '8,37809', '14,342437', '8,912574', '11,434124',
       '6,241987', '3,335037', '3,986974', '10,0997', '16,656388',
       '10,662901', '5,055049', '8,90449', '13,600824', '17,784999',
       '17,042775', '12,502763', '15,737162', '13,808', '17,042587',
       '17,311051', '16,123126', '16,983649', '18,793149', '14,876451',
       '12,561936', '12,62125', '6,835738', '14,13465', '14,668288',
       '11,820276', '11,612489', '12,093111', '13,689701', '17,07215',
       '12,264661', '10,366049', '12,028063', '15,143975', '14,401986',
       '12,502951', '7,851878', '10,217388', '12,294976', '14,935812',
       '18,972125', '20,901475', '20,128889', '13,480786', '15,173162',
       '16,241425', '17,517287', '10,306688', '6,152875', '11,048301',
       '14,104899', '14,728401', '13,273939', '9,742688', '13,511712',
       '19,119987', '17,281112', '17,250374', '19,773099', '19,476576',
       '17,696639', '16,775674', '18,201137', '17,992175', '15,113801',
       '16,479151', '17,606399', '19,506186', '17,8453', '19,090988',
       '20,010825', '18,912811', '17,902687', '18,644676', '19,446449',
       '18,586349', '19,298775', '19,476764', '20,5149', '21,493863',
       '22,206712', '22,146975', '23,9271', '23,630624', '20,723063',
       '21,672463', '18,615536', '19,031251', '19,179301', '16,716924',
       '17,487724', '18,763774', '20,099514', '22,473249', '23,185299',
       '23,006887', '20,100125', '20,722499', '21,346001', '20,188438',
       '19,951276', '17,607574', '22,770524', '25,885825', '27,369474',
       '22,681224', '23,689938', '22,117224', '23,749064', '28,970811',
       '24,312876', '27,279563', '24,995786', '25,02455', '26,419839',
       '27,784249', '27,369051', '29,830911', '29,148612', '22,740538',
       '23,245412', '22,562737', '22,770101', '23,392851', '23,303974',
       '25,144212', '26,033875', '27,517101', '27,042401', '25,588174',
       '20,812175', '17,845864', '21,286875', '24,430799', '25,20315',
       '26,745361', '26,508199', '24,787388', '24,194389', '23,156112',
       '25,233136', '25,974749', '27,309549', '26,360713', '25,381562',
       '24,788187', '25,262699', '24,876876', '23,393274', '21,998549',
       '22,295213', '23,125938', '23,808613', '22,739739', '3,3928',
       '21,049337', '20,010449', '20,248175', '21,227138', '22,4137',
       '21,078712', '20,011812', '23,30435', '21,879451', '22,680613',
       '24,4018', '25,024738', '22,503', '23,096563', '24,283736',
       '25,322201', '22,800463', '22,888588', '19,209475', '18,585738',
       '18,051724', '18,793525', '19,239226', '19,535749', '18,466875',
       '17,99335', '18,558149', '17,398988', '17,013024', '18,882637',
       '20,692936', '15,618111', '17,576413', '20,040811', '20,574825',
       '19,535561', '16,923912', '16,330161', '16,064', '17,489651',
       '20,159251', '19,268789', '17,310487', '11,731211', '12,591264',
       '15,647674', '12,264849', '12,354337', '10,395612', '15,885588',
       '17,34005', '13,659762', '13,155076', '16,093375', '14,223574',
       '13,452163', '14,698274', '16,924288', '18,259887', '16,895336',
       '16,56925', '16,211251', '13,985237', '12,6518', '6,565723',
       '8,9717', '9,387274', '8,733974', '7,220762', '7,488286',
       '6,479525', '5,233649', '4,907187', '8,068877', '8,704411',
       '10,515039', '11,820511', '14,343001', '7,191575', '5,233461',
       '7,251124', '8,318588', '7,340001', '7,874062', '9,891349',
       '9,149924', '8,467437', '3,690874', '4,106401', '7,933188',
       '5,233226', '5,618814', '7,221749', '6,894488', '8,882776',
       '13,422412', '14,045538', '12,116564', '4,135588', '7,132026',
       '10,306876', '10,336862', '12,472025', '7,903061', '5,974886',
       '5,826836', '5,827024', '7,904001', '9,387086', '10,8705',
       '11,256276', '8,081614', '7,755199', '6,183049', '3,097311',
       '4,195137', '4,1683', '5,839855', '2,355651', '2,652174',
       '3,987162', '2,8899', '2,503889'], dtype=object)
In [30]:
# No see evidencian datos nan pero si problemas con el tipo de dato y la coma. Esto lo corregimos con un reflace y astype

RentalBikesDF['Feel_Temp']=RentalBikesDF['Feel_Temp'].replace(',', '.',regex=True)
RentalBikesDF['Feel_Temp']=RentalBikesDF['Feel_Temp'].astype(float, errors = 'raise')
In [31]:
RentalBikesDF['Feel_Temp'].min(),RentalBikesDF['Feel_Temp'].max()
Out[31]:
(-4.2837288, 31.522112)
In [32]:
RentalBikesDF['Hum'].unique()
Out[32]:
array(['0,805833', '0,696087', '0,437273', '0,590435', '0,436957',
       '0,518261', '0,498696', '0,535833', '0,434167', '0,482917',
       '0,686364', '0,599545', '0,470417', '0,537826', '0,49875',
       '0,48375', '0,5375', '0,861667', '0,741739', '0,538333',
       '0,457083', '0,4', '0,436522', '0,491739', '0,616957', '0,8625',
       '0,6875', '0,793043', '0,651739', '0,722174', '0,60375',
       '0,829565', '0,775417', '0,437826', '0,585217', '0,929167',
       '0,568333', '0,738333', '0,537917', '0,494783', '0,437391',
       '0,506364', '0,544167', '0,457391', '0,375833', '0,314348',
       '0,423478', '0,505', '0,516667', '0,187917', '0,407826', '0,605',
       '0,577778', '0,423043', '0,697391', '0,712174', '0,68', '0,876364',
       '0,535', '0,449583', '0,318333', '0,610417', '0,789167',
       '0,948261', '0,551304', '0,420833', '0', '0,649565', '0,594583',
       '0,527391', '0,496957', '0,655652', '0,776522', '0,602917',
       '0,525217', '0,379167', '0,47375', '0,737391', '0,624583',
       '0,839565', '0,495', '0,394167', '0,493913', '0,302174',
       '0,314167', '0,646667', '0,918333', '0,68625', '0,65375', '0,48',
       '0,42625', '0,642083', '0,470833', '0,83625', '0,8775', '0,8575',
       '0,716956', '0,739167', '0,819167', '0,540417', '0,67125',
       '0,888333', '0,479583', '0,5425', '0,665833', '0,614167',
       '0,407083', '0,729583', '0,887917', '0,810833', '0,776667',
       '0,729167', '0,835417', '0,700833', '0,503333', '0,762083', '0,73',
       '0,697083', '0,737083', '0,444167', '0,59', '0,54125', '0,631667',
       '0,58875', '0,489167', '0,632917', '0,7475', '0,863333', '0,9225',
       '10,867083', '0,787917', '0,837917', '0,87', '0,829583',
       '0,719583', '0,626667', '0,749583', '0,81', '0,740833', '0,69625',
       '0,6775', '0,81875', '0,685', '0,636667', '0,677083', '0,305',
       '0,354167', '0,45625', '0,6525', '0,6', '0,597917', '0,622083',
       '0,654583', '0,747917', '0,494583', '0,507083', '0,471667',
       '0,688333', '0,735833', '0,670417', '0,666667', '0,74625',
       '0,770417', '0,7075', '0,703333', '0,573333', '0,483333',
       '0,513333', '0,658333', '0,634167', '0,497917', '0,39625',
       '0,444583', '0,6825', '0,637917', '0,590417', '0,743333',
       '0,65125', '0,757917', '0,609167', '0,578333', '0,635833',
       '0,559167', '0,47625', '0,59125', '0,585', '0,604167', '0,650417',
       '0,707083', '0,69125', '0,580417', '0,5', '0,550833', '0,757083',
       '0,540833', '0,402917', '0,583333', '0,465833', '0,480833',
       '0,49125', '0,6575', '0,7575', '0,630833', '0,755', '0,752917',
       '0,592083', '0,570417', '0,424167', '0,42375', '0,415', '0,8175',
       '0,712083', '0,575417', '0,722917', '0,674167', '0,77', '0,47',
       '0,455417', '0,771667', '0,76125', '0,85', '0,561765', '0,554583',
       '0,548333', '0,639167', '0,727083', '0,716667', '0,742083',
       '0,790417', '0,886957', '0,917083', '0,939565', '0,897917',
       '0,75375', '0,71375', '0,692174', '0,7125', '0,709167', '0,718333',
       '0,695', '0,69', '0,88125', '0,9', '0,902083', '0,9725', '0,845',
       '0,848333', '0,885417', '0,84875', '0,699167', '0,6475',
       '0,791667', '0,760833', '0,71', '0,647917', '0,620833', '0,684167',
       '0,70125', '0,7275', '0,73375', '0,80875', '0,90625', '0,896667',
       '0,71625', '0,486667', '0,579583', '0,701667', '0,895217',
       '0,63625', '0,574167', '0,629167', '0,74125', '0,772083',
       '0,622917', '0,720417', '0,812917', '0,585833', '0,8825',
       '0,62375', '0,68375', '0,71875', '0,702083', '0,6225', '0,519167',
       '0,734583', '0,75875', '0,721667', '0,758333', '0,813333',
       '0,44625', '0,552917', '0,458333', '0,587083', '0,68875', '0,93',
       '0,575833', '0,41', '0,502083', '0,684583', '0,91', '0,9625',
       '0,549167', '0,64375', '0,681667', '0,698333', '0,743043',
       '0,830833', '0,613333', '0,524583', '0,625833', '0,612917',
       '0,775833', '0,827083', '0,949583', '0,970417', '0,58', '0,695833',
       '0,5075', '0,49', '0,670833', '10,66375', '0,500417', '0,560833',
       '0,58625', '0,6375', '0,595417', '0,858333', '0,681304',
       '0,506957', '0,7625', '0,503913', '0,615833', '0,6925', '0,381304',
       '0,44125', '0,414583', '0,524167', '0,542083', '0,531667', '0,465',
       '0,646522', '0,8475', '0,802917', '0,4575', '0,419167', '0,5225',
       '0,716087', '0,443333', '0,4975', '0,45', '0,83125', '0,79625',
       '0,91125', '0,835833', '0,769583', '0,543333', '0,31125',
       '0,400833', '0,416667', '0,507917', '0,672917', '0,526667',
       '0,779583', '0,687917', '0,622174', '0,49625', '0,562083', '0,54',
       '0,73125', '0,464583', '0,41125', '0,50875', '0,53125', '0,634583',
       '0,534583', '0,515833', '0,507826', '0,594348', '0,567917',
       '0,7375', '0,395833', '0,490833', '0,804783', '0,615417',
       '0,657083', '0,62125', '0,403333', '0,50625', '0,456667', '0,5675',
       '0,350417', '10,476957', '0,6175', '0,842083', '0,755833',
       '0,72875', '0,807917', '0,82125', '0,694167', '0,880833',
       '0,477917', '0,29', '0,48125', '0,439167', '0,580833', '0,67625',
       '0,504348', '0,396667', '0,469583', '0,374167', '0,377083',
       '0,254167', '0,275833', '0,3175', '0,435', '0,469565', '0,46625',
       '0,408333', '0,502917', '0,561667', '0,390417', '0,569167',
       '0,6125', '0,694583', '0,682917', '0,766667', '0,454167',
       '0,427917', '0,756667', '0,489583', '0,57', '0,659583', '0,797083',
       '0,768333', '0,735417', '0,74', '0,664167', '0,685833', '0,744167',
       '0,552083', '0,360417', '0,480417', '0,57625', '0,789583',
       '0,794583', '0,697917', '0,52', '0,523333', '0,530417', '0,81125',
       '0,765833', '0,774583', '0,747083', '0,7325', '0,67', '0,492917',
       '0,755417', '0,493333', '0,487083', '0,61125', '0,567083',
       '0,467917', '0,437083', '0,587917', '0,833333', '0,582083',
       '0,569583', '0,589583', '0,504167', '0,59875', '0,777917',
       '0,57375', '0,479167', '0,373333', '0,36', '0,4225', '0,48875',
       '0,60125', '0,51875', '0,447083', '0,492083', '0,53875',
       '0,457917', '0,450833', '0,683333', '0,6675', '0,633333',
       '0,529583', '0,485833', '0,717917', '0,645', '0,505833',
       '0,577083', '0,600417', '0,844167', '0,865417', '0,655',
       '0,596667', '0,66875', '0,704167', '0,6425', '0,654167', '0,70375',
       '0,620417', '0,715833', '0,732917', '0,545417', '0,686667',
       '0,619583', '0,570833', '0,603333', '0,711667', '0,734167',
       '0,67375', '0,615', '0,712917', '0,845833', '0,730417', '0,62',
       '0,5875', '0,638333', '0,815', '0,790833', '0,810417', '0,73625',
       '0,799167', '0,5475', '0,50375', '0,637083', '0,6725', '0,501667',
       '0,8725', '0,536667', '0,618333', '0,467083', '0,690833',
       '0,542917', '0,649167', '0,871667', '0,79375', '0,6275',
       '0,708333', '0,709583', '0,761667', '0,463333', '0,539167',
       '0,640417', '0,558333', '0,692917', '0,728333', '0,572917', '0,51',
       '0,641667', '0,800417', '0,807083', '0,72', '0,88', '0,825455',
       '0,581667', '0,522083', '0,532917', '0,494167', '0,333478',
       '0,645417', '0,659167', '0,741667', '0,662917', '0,623333',
       '0,61375', '0,56875', '0,404583', '0,468333', '0,535417',
       '0,786667', '0,555652', '0,649583', '0,806667', '0,823333',
       '0,7675', '0,485', '0,764167', '0,905417', '0,925', '0,642917',
       '0,83875', '0,907083', '0,66625', '0,625417', '0,667917',
       '0,556667', '10,44125', '0,515417', '0,791304', '0,734783',
       '0,652917', '0,5775'], dtype=object)
In [33]:
# No see evidencian datos nan pero si problemas con el tipo de dato y la coma. Esto lo corregimos con un reflace y astype

RentalBikesDF['Hum']=RentalBikesDF['Hum'].replace(',', '.',regex=True)
RentalBikesDF['Hum']=RentalBikesDF['Hum'].astype(float, errors = 'raise')
In [34]:
RentalBikesDF['Hum'].min(),RentalBikesDF['Hum'].max()
Out[34]:
(0.0, 10.867083)
In [35]:
RentalBikesDF['Wind'].unique()
Out[35]:
array(['10,749882', '-16,652113', '16,636703', '10,739832', '12,5223',
       '6,0008684', '11,304642', '17,875868', '24,25065', '14,958889',
       '8,182844', '20,410009', '20,167', '8,478716', '10,583521',
       '12,625011', '12,999139', '9,833925', '13,957239', '13,125568',
       '23,667214', '11,52199', '16,5222', '10,60811', '8,696332',
       '19,68795', '7,627079', '8,2611', '9,739455', '4,9568342',
       '12,541864', '3,565271', '17,708636', '18,609384', '8,565213',
       '10,792293', '9,5006', '3,0423561', '12,652213', '14,869645',
       '7,27285', '13,625589', '17,479161', '27,999836', '19,522058',
       '16,869997', '15,416968', '17,749975', '34,000021', '14,956745',
       '20,625682', '13,110761', '6,305571', '16,783232', '23,218113',
       '12,500257', '8,391616', '19,408962', '14,500475', '20,624811',
       '15,125518', '13,624182', '16,875357', '23,000229', '22,870584',
       '8,08355', '14,75005', '17,545759', '15,60899', '14,791925',
       '18,130468', '9,174042', '12,348703', '13,608839', '14,041793',
       '15,478139', '24,667189', '13,917307', '19,348461', '15,12525',
       '15,695487', '16,333729', '15,458575', '14,041257', '12,3481',
       '14,217668', '15,208732', '11,583496', '14,582282', '17,333436',
       '13,208782', '12,208271', '25,833257', '26,000489', '17,625221',
       '10,874904', '15,208464', '8,916561', '9,833389', '21,739758',
       '18,416893', '16,791339', '7,4169', '15,167125', '22,834136',
       '20,334232', '10,958989', '10,584057', '16,208975', '21,792286',
       '14,707907', '12,875725', '12,417311', '21,8755', '20,9174',
       '21,500836', '16,084221', '15,750025', '7,125718', '12,291418',
       '22,958689', '22,042732', '19,791264', '15,292482', '10,75015',
       '5,0007125', '11,792', '7,749957', '8,083014', '12,707689',
       '12,041575', '9,04165', '10,249593', '8,500357', '18,582718',
       '13,499964', '7,250271', '8,375871', '9,916536', '15,667414',
       '13,875164', '10,333611', '13,376014', '16,125493', '15,416164',
       '14,333846', '8,792075', '7,459043', '19,583229', '16,959107',
       '8,250514', '9,292364', '8,167032', '12,583136', '9,166739',
       '10,042161', '9,417118', '10,37495', '20,45845', '18,041961',
       '11,250104', '13,833557', '9,582943', '8,000336', '6,834',
       '10,416825', '11,458675', '11,541554', '15,999868', '14,875675',
       '6,3337311', '7,208396', '9,666961', '17,542007', '12,415904',
       '6,874736', '7,709154', '15,333486', '5,4591064', '8,459286',
       '10,6664', '15,083643', '12,292557', '18,916579', '13,417018',
       '9,790911', '16,124689', '12,249811', '13,958914', '16,417211',
       '14,458868', '8,7502', '7,625739', '14,875407', '8,9177',
       '8,791807', '11,334457', '6,0841561', '13,417286', '12,292021',
       '11,958093', '11,667246', '11,291979', '11,042471', '10,500039',
       '13,79195', '9,084061', '13,20905', '12,374632', '15,29275',
       '13,499629', '10,125107', '11,041332', '8,416607', '14,167418',
       '14,916411', '13,999918', '15,834043', '9,625689', '15,624936',
       '9,333636', '6,999289', '16,666518', '18,54225', '9,833121',
       '16,958236', '14,125811', '5,6254875', '25,166339', '20,412153',
       '10,708275', '8,375536', '5,5833311', '9,500332', '9,375243',
       '12,416775', '13,833289', '14,250632', '23,044181', '6,5003936',
       '12,914116', '8,333393', '10,291736', '7,708618', '5,957171',
       '9,500868', '11,2091', '18,166782', '11,000261', '12,708225',
       '11,958361', '10,166714', '9,041918', '6,4590814', '8,584375',
       '5,2505689', '5,2516811', '3,3754064', '7,917457', '9,958143',
       '11,583161', '13,833825', '19,583832', '14,874871', '5,5841686',
       '13,792218', '11,87575', '1,5002439', '3,0420814', '4,25115',
       '2,8343814', '9,583814', '16,62605', '9,499729', '15,000161',
       '17,291561', '18,875039', '11,750393', '7,375829', '16,303713',
       '28,292425', '14,833532', '6,2086689', '6,6673375', '7,959064',
       '11,166086', '9,959014', '13,250121', '15,375093', '23,541857',
       '11,833339', '7,12545', '9,083257', '5,5001439', '18,209193',
       '12,667154', '6,1676314', '3,834075', '4,6255125', '4,1671186',
       '12,667489', '21,083225', '14,208154', '18,875307', '20,541932',
       '13,375411', '9,167543', '20,459254', '11,291711', '15,041232',
       '12,45865', '9,249618', '22,500275', '11,209368', '-6,6260186',
       '4,5841936', '9,522174', '17,292164', '18,167586', '14,750586',
       '6,750518', '6,4174811', '5,6252061', '4,1679561', '15,583061',
       '17,833725', '16,083886', '5,5420189', '15,625807', '4,4582939',
       '9,41685', '4,0842061', '17,958814', '17,458525', '16,292189',
       '11,375193', '11,584032', '4,1252436', '14,8338', '3,167425',
       '18,374482', '12,750368', '10,391097', '16,044155', '12,62615',
       '19,695387', '8,000604', '9,000579', '14,750318', '12,875189',
       '22,087555', '24,499957', '12,3749', '8,709129', '11,249836',
       '11,708786', '12,833314', '6,6263', '12,565984', '12,124789',
       '25,333236', '12,541261', '16,834286', '15,500986', '23,39171',
       '27,833743', '13,58425', '14,917014', '13,375746', '7,417436',
       '8,292389', '10,791757', '4,9175186', '14,125543', '16,08335',
       '14,458064', '17,541739', '12,541529', '11,959232', '11,791732',
       '10,3046', '9,874393', '8,959307', '13,000479', '7,834243',
       '19,416332', '27,417204', '11,207961', '9,458993', '12,1672',
       '6,125475', '13,791682', '12,792243', '16,958504', '15,348561',
       '13,783039', '15,709557', '12,791171', '15,916989', '28,250014',
       '13,750343', '17,958211', '12,958939', '12,000839', '15,208129',
       '9,708568', '22,416257', '13,458625', '23,167193', '29,584721',
       '27,7916', '14,913329', '13,916771', '15,87565', '7,583864',
       '7,417168', '8,501161', '10,875239', '8,125157', '6,0004061',
       '7,876654', '7,7921', '12,916461', '25,917007', '19,541957',
       '21,41655', '9,250489', '11,541889', '20,913313', '6,708911',
       '12,125325', '14,708443', '20,125996', '18,416357', '15,583932',
       '23,999132', '16,708125', '19,783358', '19,458743', '10,416557',
       '12,791439', '19,083543', '18,333143', '4,4172564', '10,041357',
       '19,000329', '23,084582', '16,708661', '11,833875', '23,291411',
       '8,708325', '7,832836', '11,499746', '10,458432', '9,249886',
       '8,957632', '10,916846', '10,250464', '10,041893', '15,458307',
       '19,833943', '14,499604', '21,042221', '15,874779', '8,249911',
       '15,082839', '14,250364', '9,875264', '8,208304', '15,374825',
       '5,626325', '17,042589', '15,624668', '7,917189', '13,332464',
       '14,416457', '13,166907', '19,7918', '9,000043', '13,083693',
       '15,916721', '12,499654', '12,333829', '19,083811', '14,041525',
       '5,167375', '10,54245', '11,750661', '9,667229', '14,374582',
       '22,999693', '17,000111', '11,166689', '11,707982', '9,917139',
       '7,625404', '7,958729', '12,250414', '12,041307', '9,750175',
       '20,125661', '23,292014', '18,208925', '11,50055', '11,082939',
       '11,291443', '13,082889', '8,457879', '12,999943', '9,791514',
       '10,958118', '8,417143', '10,166379', '10,166111', '5,41695',
       '9,626493', '11,000529', '7,666743', '9,208614', '11,083743',
       '14,000789', '14,2911', '6,2926936', '9,291761', '11,0416',
       '19,082471', '11,416532', '10,292339', '11,083475', '8,666718',
       '17,249686', '19,458207', '10,4587', '16,000471', '13,834093',
       '9,126204', '11,333586', '11,374657', '15,500718', '11,917089',
       '5,79215', '8,708593', '4,8756436', '4,7089811', '5,6679186',
       '4,8337686', '16,375336', '8,625111', '12,791975', '7,541654',
       '5,1668189', '7,583529', '4,2927436', '15,833507', '9,542207',
       '11,500282', '18,833968', '17,333771', '8,833682', '5,5422936',
       '6,958821', '16,583907', '6,0422811', '23,958329', '14,416725',
       '10,333343', '19,000061', '14,958286', '9,541068', '16,3748',
       '9,000914', '-10,999993', '15,249468', '9,042186', '6,0838814',
       '6,999825', '4,4585686', '7,875582', '17,957675', '9,457854',
       '12,708493', '12,7501', '12,584007', '12,166932', '15,751164',
       '18,667004', '19,834479', '12,208807', '6,791857', '7,874979',
       '11,125618', '5,4593811', '6,3345686', '4,8762064', '8,333125',
       '8,875289', '15,791364', '26,666536', '23,9994', '14,271603',
       '10,542182', '18,125443', '12,000236', '15,833775', '11,625371',
       '20,375236', '23,304945', '14,375386', '3,8756686', '8,5425',
       '11,625639', '22,917082', '13,374875', '10,250129', '12,041843',
       '15,250004', '15,749489', '5,542575', '6,917482', '3,5423436',
       '9,917407', '25,250357', '10,0835', '3,12555', '15,916654',
       '14,125007', '7,739974', '3,9175436', '4,0001814', '11,666643',
       '21,709407', '11,708518', '6,792393', '10,584325', '12,750636',
       '10,916779', '8,792343', '6,749714', '6,5833061', '14,834068',
       '12,334164', '8,875021', '25,083661', '27,292182', '5,1744368',
       '21,208582', '23,458911', '8,333661', '23,500518', '10,374682'],
      dtype=object)
In [36]:
# No see evidencian datos nan pero si problemas con el tipo de dato y la coma. Esto lo corregimos con un reflace y astype

RentalBikesDF['Wind']=RentalBikesDF['Wind'].replace(',', '.',regex=True)
RentalBikesDF['Wind']=RentalBikesDF['Wind'].astype(float, errors = 'raise')
In [37]:
RentalBikesDF['Wind'].min(),RentalBikesDF['Wind'].max()
Out[37]:
(-16.652113, 34.000021)
In [38]:
RentalBikesDF['Casual'].unique()
Out[38]:
array([ 331,  131,  120,  108,   82,   88,  148,   68,   54,   41,   43,
         25,   38,  222,  251,  117,    9,   78,   83,   75,   93,  150,
         86,  186,   34,   15,  123,  140,   42,   47,   72,   61,  100,
        354,   64,   53,  149,  288,  397,  208,  218,  259,  579,  532,
        639,  195,   74,  139,  424,  694,   81,  137,  231,  214,  640,
        114,  244,  316,  191,   46,  247,  724,  982,  359,  289,  321,
        884, 1424, 1047,  401,  460,  203,  166,  300,  981,  472,  317,
        168,  179,  307,  898, 1651,  734,  167,  413,  571,  172,  879,
       1188,  855,  257,  209,  529,  642,  121, 1558,  669,  409,  613,
        745,  177, 1462, 1710,  773,  678,  547,  569,  878, 1965, 1138,
        847,  603,  255,  614,  894, 1612, 1401,  664,  550,  695,  692,
        902, 1582,  536,  735,  909, 2258, 1576,  836,  659,  740,  758,
        871, 2001, 2355, 1549,  673,  513,  736, 1869, 1685,  763,  676,
        563,  815, 1729, 1467,  863,  727,  769,  545, 1807, 1639,  699,
        774,  661,  746,  969, 1782, 1920,  854,  732,  848, 1027, 1246,
       2204, 2282, 3065, 1031,  784,  754, 1988, 1743,  723,  662,  748,
        888, 1318, 2418, 2006,  841,  752,  644,  632,  562,  987, 1050,
        568,  750,  755,  606,  670, 1559, 1524,  729,  801,  467,  799,
       1023, 1521, 1298,  846,  907,  812, 1051, 1504, 1338,  775,  721,
        668,  797, 1914, 1249,  833, 1281,  949,  435,  768,  226, 1415,
        688,  783,  875, 1935, 2521, 1236,  204,  118,  153,  417, 1750,
       1633,  690,  701,  647,  428,  742, 1434, 1353,  691,  438,  539,
        555,  258, 1776, 1544,  684,  477,  480,  653,  830,  616,  330,
        486,  559, 2235, 2397, 1514,  667,  217,  290, 1899, 1748,  713,
        637,  254,  471, 1499, 1619,  404,  240,  456,   57,  885,  362,
        410,  370,  318,  470, 1156,  952,  373,  376,  305,  190,  440,
       1275, 1004,  595,  449,  145,  245,  943,  787,  220,   69,  112,
        560, 1095,  810,  253,   96,  188,  182,  268,  706,  634,  233,
        126,   50,  261,  502,  377,  143,  155,  178,  181,  275,  260,
        216,  107,  227,  163,  303,  430,  103,  491,  665,  686,   89,
         95, 1070,  599,  106,  173,   92,  269,  174,  333,  284,  127,
        109,  130,  115,   67,  196,  439,  558,  324,  304,  310,  384,
        206,  199,  192,   73,   94,  135,  141,  349, 1435,  618,  394,
        516,  246,  515,  229,   65,  325,  956,  710,  221,  432,  447,
        968, 1658,  838,  762,  997, 1005,  548, 3155, 2207, 1122, 1334,
       2469, 1033, 1532,  795,  531,  674,  834,  796, 2301, 2347, 1208,
       1348, 1058, 1192, 3252, 2230,  905,  819,  482,  663, 1252, 2795,
       2846, 1198,  989,  347, 1340, 2541,  518,  655,  475, 1014, 1120,
       2229,  764, 1069, 2496, 2135, 1008,  738,  620, 1026, 1319, 2622,
       2172,  342,  625,  991, 1242, 3410, 2704,  630,  766, 1059, 1417,
       2855, 3283, 2557,  880, 1100,  533, 2494, 1071, 1038, 1488, 2708,
       2224, 1017, 1173, 1180, 1563, 2963, 2634,  872,  778,  964, 2657,
       2551, 1139, 1077,  921,  829, 1455, 1421,  904, 1052, 2562, 1405,
       1366, 1448, 1203,  998,  954,  975, 1032, 1511, 1088,  747, 1264,
       2544, 1135, 1140, 1383, 1036, 1259, 2234, 2153, 1040, 1074,  983,
       1328, 2345, 1707, 1233, 1278, 1263, 1196, 1065, 2247, 2182, 1207,
       1128, 1483, 2827, 1081, 1094, 1363, 1325, 1829,  935, 1177, 1172,
       1433, 2352, 2613,  867,  832,  611, 1045, 1557, 2570, 1118, 1054,
       1379, 3160, 2166, 1022,  371,  788,  939, 1250, 2512, 2454, 1001,
        845,  751, 2589, 2015,  315,  728,  891, 1516, 3031,  781,  874,
        601,  780, 1060, 2252, 2080,  760,  922,  979,  753, 2806, 2132,
       1182, 2643,    2,   87,  419,  466, 1029, 1201,  378,  326,  340,
        709, 2090, 2290, 1097,  327,  320,  484, 1313,  534,  615,  955,
       1603,  309,  337,  198,  243,  951,  892,  551, 1153,  441,  329,
        282,  425,  429,  767,  538,  212,  433,  314,  205,  408,  159,
        364], dtype=int64)
In [39]:
RentalBikesDF['Casual'].min(),RentalBikesDF['Casual'].max()
Out[39]:
(2, 3410)
In [40]:
RentalBikesDF['Registered'].unique()
Out[40]:
array([ 654,  670, 1229, 1454, 1518, 1362,  891,  768, 1280, 1220, 1137,
       1368, 1367, 1026,  953,  883,  674, 1572, 1844, 1468,  888,  836,
       1330, 1799,  472,  416, 1129,  975,  956, 1459, 1313, 1489, 1620,
        905, 1269, 1592, 1466, 1552, 1491, 1597, 1184, 1192, 1705, 1675,
       1897, 2216, 2348, 1103, 1173,  912, 1376, 1778, 1707, 1341, 1545,
       1708, 1365, 1714, 1903, 1562, 1730, 1437,  491, 1628, 1817, 1700,
        577, 1408, 1435, 1687, 1767, 1871, 2320, 2355, 1693, 1424, 1676,
       2243, 1918, 1699, 1910, 1515, 1221, 1806, 2108, 1506, 1920, 1354,
       1598, 2381, 2395, 2570, 1299, 1576, 2493, 1777, 1953, 2738, 2484,
       2186, 2760, 2795, 3331, 3444, 2574, 2481, 3300, 3722, 3325, 3489,
       3717, 3347, 2213, 3554, 3848, 2378, 3819, 3714, 3102, 2932, 3698,
       4109, 3632, 4169, 3413, 2507, 2971, 3185, 3445, 3319, 3840, 4008,
       3547, 3084, 3438, 3833, 4238, 3919, 3808, 2757, 2433, 2549, 3309,
       3461, 4232, 4414, 3473, 3221, 3875, 4070, 3725, 3352, 3771, 3237,
       2993, 4157, 4164, 4411, 3222, 3981, 3312, 3105, 3311, 4061, 3846,
       4044, 4022, 3420, 3385, 3854, 3916, 4377, 4488, 4116, 2915, 2367,
       2978, 3634, 3845, 3838, 3348, 3138, 3363, 3596, 3594, 4196, 4220,
       3505, 3296, 3617, 3789, 3688, 3152, 2825, 2298, 2556, 3272, 3901,
       3784, 3176, 2916, 2778, 3537, 3107, 3777, 3843, 2773, 2487, 3480,
       3695, 3896, 3980, 2646, 2482, 3563, 4004, 4026, 3166, 3356, 3277,
       2624, 3925, 4614, 4181, 3893,  889, 2919, 3905, 4429, 4370, 4332,
       3852, 2419, 2115, 2506, 1878, 1689, 3127, 3595, 4023, 4062, 4138,
       3231, 4018, 3077, 2921, 3203, 3813, 4240, 2137, 3647, 3466, 3946,
       3643, 3427, 4186, 4372, 1949, 2302, 3240, 3970, 4267, 4126, 4036,
       3174, 3114, 3603, 2199, 2623, 3115, 3318, 3293, 3857, 4111, 2170,
       3724, 3628, 2809, 2762, 3488, 3992, 3490, 3291,  570, 2446, 3307,
       3658, 3816, 3656, 3576, 2770, 2697, 3662, 3829, 3804, 2743, 2928,
       2792, 2713, 3891, 3746, 1672, 2914, 3147, 2720, 2733, 2545, 1538,
       2454,  935, 1697, 1819, 2261, 3614, 2818, 3425, 3545, 3672, 2908,
       2851, 3578, 2468,  655, 3172, 3359, 2688, 2366, 3167, 3368, 3562,
       3528, 3399, 2464, 2211, 3143, 3534, 2553, 2841, 2046,  856,  451,
        887, 1059, 2047, 2169, 2508, 1820, 1608, 2147, 2273, 3132, 3791,
       3451, 2826, 2270, 2085, 3828, 3040, 2160, 2027, 2081, 2808, 3267,
       3162, 3048, 1234, 1781, 2287, 3900, 3803, 3831, 3187, 3248, 2685,
       3498, 4185, 4275, 3571, 3841, 2448, 2629, 4176, 2693, 3667, 3604,
       1977, 1456, 3328, 3787, 4028, 2931, 3805, 2883, 2071, 2627, 4379,
       4546, 3241, 2415, 2874, 4069, 4134, 1769, 4665, 2948, 3110, 3130,
       3735, 4484, 4896, 4122, 3150, 3253, 4460, 5085, 5315, 5187, 3830,
       4681, 3685, 5171, 5042, 5108, 5537, 5893, 2339, 3464, 4763, 4571,
       5024, 5299, 4663, 3934, 3694, 4728, 5424, 5378, 5265, 4653, 3605,
       2939, 4680, 5099, 4380, 4746, 5146, 4286, 5172, 5702, 4020, 5719,
       5950, 4083,  907, 3019, 5115, 5541, 4551, 5219, 3100, 4075, 4907,
       5087, 5502, 5657, 5227, 4387, 4224, 4990, 4097, 5546, 5711, 4807,
       2501, 4490, 6433, 6142, 6118, 4884, 4425, 3729, 5254, 4494, 5317,
       3681, 3308, 3486, 4863, 6110, 6238, 5325, 5147, 5927, 6033, 6028,
       6456, 6248, 4790, 4374, 5647, 4495, 6183, 6102, 4739, 4344, 4446,
       5857, 5339, 5127, 4859, 4801, 4340, 5640, 6365, 6258, 5958, 4634,
       4110, 5323, 5608, 4841, 4836, 3392, 3469, 5571, 5336, 6289, 6414,
       5988, 5742, 5865, 4914, 5703, 5123, 3195, 4866, 5831, 6452, 6790,
       5825, 5645, 4451, 4444, 6065, 6506, 6278, 5847, 4479, 3757, 5780,
       5995, 6271, 6090, 4721, 4052, 4362, 5676, 5656, 6149, 6267, 5665,
       5038, 3341, 5504, 5925, 6281, 6402, 6257, 3772, 5928, 6105, 6520,
       6541, 5917, 3788, 3197, 5997, 6280, 5592, 6459, 4419, 6407, 6697,
       6820, 6750, 6630, 5554, 5167, 3702, 6803, 6781, 6917, 5883, 5453,
       6435, 6693, 6946, 6642, 6370, 5966, 4874, 6015, 4324, 6844, 6437,
       6640, 4934, 2729, 4604, 5791, 6911, 6736, 6222, 4857, 4559, 6612,
       6482, 6501, 4671, 5284, 4692, 6228, 6625, 6898, 6484, 6262, 5209,
         20, 1009, 5520, 5229, 3906, 4881, 5220, 4709, 4975, 5283, 4562,
       3767, 5122, 5125, 5214, 4316, 3747, 5050, 5100, 4531, 1470, 2307,
       1745, 4750, 3836, 5062, 5080, 5306, 5679, 6055, 5398, 5035, 4659,
       2787, 5009, 5107, 5182, 4280, 4373, 5124, 3814, 3402, 1544, 1379,
        746,  573,  432, 1867, 2451, 1182, 1432, 2290], dtype=int64)
In [41]:
RentalBikesDF['Registered'].min(),RentalBikesDF['Registered'].max()
Out[41]:
(20, 6946)
In [42]:
# solo por evitar problemas de datos, vamos a calcular el conteo = registrados vs Casual. 

RentalBikesDF['cnt']=RentalBikesDF['Registered']+RentalBikesDF['Casual']
In [43]:
RentalBikesDF.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 731 entries, 0 to 730
Data columns (total 13 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   instant     731 non-null    int64         
 1   Date        731 non-null    datetime64[ns]
 2   Season      731 non-null    object        
 3   Holiday     731 non-null    object        
 4   Weather     731 non-null    object        
 5   Temp        731 non-null    float64       
 6   Feel_Temp   731 non-null    float64       
 7   Hum         731 non-null    float64       
 8   Wind        731 non-null    float64       
 9   Casual      731 non-null    int64         
 10  Registered  731 non-null    int64         
 11  cnt         731 non-null    int64         
 12  Day         731 non-null    int64         
dtypes: datetime64[ns](1), float64(4), int64(5), object(3)
memory usage: 74.4+ KB
In [44]:
# Aca volvemos al outlayer que creamos de temperatura. Básicamente vamos a crear un dfTemporal con los registros que le 
# insertamos la temperatura de 5000 y vamos a reemplazar ese valor por la Sensación terminca (Feel_Temp) para puego actualizar
# el valor en el dfInicial

flt=RentalBikesDF['Temp']==5000
dfTemp=RentalBikesDF[flt]
dfTemp['Temp']=dfTemp['Feel_Temp']
RentalBikesDF['Temp'].update(dfTemp['Temp'])
In [45]:
RentalBikesDF['Temp'].min(),RentalBikesDF['Temp'].max()
Out[45]:
(-5.2208712, 32.498349)
In [46]:
# Ahora vamos a crear una columna que se llame Clima Clomplejo que se definirá cuando hay lluvia o nieve
In [47]:
RentalBikesDF['Weather'].unique()
Out[47]:
array(['MIST + CLOUDY', 'FEW CLOUDS', 'PARTLY CLOUDY', 'CLEAR', 'MIST',
       'MIST + BROKEN CLOUDS', 'MIST + FEW CLOUDS', 'RAIN + THUNDERSTORM',
       'LIGHT RAIN + SCATTERED CLOUDS', 'SNOW'], dtype=object)
In [48]:
RentalBikesDF['Complex Weather']=RentalBikesDF['Weather'].str.contains('RAIN|SNOW|MIST', case=False, regex=True)
In [49]:
# adicionalmente vamos a volver numerico las variables cualitativas que tenemos en el dataframe

enc = OneHotEncoder(handle_unknown='ignore')
salida = enc.fit(RentalBikesDF)
le = LabelEncoder()
In [50]:
RentalBikesDF['Season encoded']=le.fit_transform(RentalBikesDF['Season'])
RentalBikesDF['Holiday encoded']=le.fit_transform(RentalBikesDF['Holiday'])
RentalBikesDF['Weather encoded']=le.fit_transform(RentalBikesDF['Weather'])
In [51]:
# Por ultimo quitamos los elementos duplicados en una fecha especifica

RentalBikesDF.drop_duplicates(subset='Date', keep='first', inplace=True)
In [52]:
RentalBikesDF.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 730 entries, 0 to 730
Data columns (total 17 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   instant          730 non-null    int64         
 1   Date             730 non-null    datetime64[ns]
 2   Season           730 non-null    object        
 3   Holiday          730 non-null    object        
 4   Weather          730 non-null    object        
 5   Temp             730 non-null    float64       
 6   Feel_Temp        730 non-null    float64       
 7   Hum              730 non-null    float64       
 8   Wind             730 non-null    float64       
 9   Casual           730 non-null    int64         
 10  Registered       730 non-null    int64         
 11  cnt              730 non-null    int64         
 12  Day              730 non-null    int64         
 13  Complex Weather  730 non-null    bool          
 14  Season encoded   730 non-null    int32         
 15  Holiday encoded  730 non-null    int32         
 16  Weather encoded  730 non-null    int32         
dtypes: bool(1), datetime64[ns](1), float64(4), int32(3), int64(5), object(3)
memory usage: 89.1+ KB
In [53]:
reduceRentalBikeDF=RentalBikesDF[['Date','Day','Season encoded','Holiday encoded','Weather encoded','Temp','Feel_Temp','Hum','Wind','Casual','Registered','cnt','Complex Weather']]
In [54]:
# ANALISIS DE DATOS (15 pts)
#Analice las variables que conforman la base de datos. Realice análisis univariados y
#bivariados según considere. No olvide utilizar técnicas de análisis visual.


#Vamos a empezar a ver la relación que hay entre las diferentes variables con respecto a la cantidad de usuarios (cnt). 

reduceRentalBikeDF.corr()
Out[54]:
Day Season encoded Holiday encoded Weather encoded Temp Feel_Temp Hum Wind Casual Registered cnt Complex Weather
Day 1.000000 0.018371 0.086435 0.013720 -0.004655 -0.000115 -0.023573 -0.003313 -0.030116 0.000847 -0.009989 -0.034540
Season encoded 0.018371 1.000000 0.013723 -0.026869 -0.376631 -0.350124 0.039160 -0.019243 -0.110244 0.013501 -0.028189 0.101000
Holiday encoded 0.086435 0.013723 1.000000 0.031925 -0.028430 -0.032506 -0.015590 0.009121 0.054117 -0.108626 -0.068293 -0.029067
Weather encoded 0.013720 -0.026869 0.031925 1.000000 -0.041807 -0.051932 0.017188 0.047226 -0.081051 -0.110812 -0.117945 0.145497
Temp -0.004655 -0.376631 -0.028430 -0.041807 1.000000 0.991637 -0.026042 -0.147645 0.543933 0.540502 0.627944 -0.115573
Feel_Temp -0.000115 -0.350124 -0.032506 -0.051932 0.991637 1.000000 -0.021284 -0.171689 0.544087 0.544342 0.631091 -0.114990
Hum -0.023573 0.039160 -0.015590 0.017188 -0.026042 -0.021284 1.000000 -0.026677 -0.008588 -0.055753 -0.047936 0.129305
Wind -0.003313 -0.019243 0.009121 0.047226 -0.147645 -0.171689 -0.026677 1.000000 -0.154247 -0.198267 -0.214299 -0.011538
Casual -0.030116 -0.110244 0.054117 -0.081051 0.543933 0.544087 -0.008588 -0.154247 1.000000 0.396230 0.673364 -0.222900
Registered 0.000847 0.013501 -0.108626 -0.110812 0.540502 0.544342 -0.055753 -0.198267 0.396230 1.000000 0.945606 -0.215771
cnt -0.009989 -0.028189 -0.068293 -0.117945 0.627944 0.631091 -0.047936 -0.214299 0.673364 0.945606 1.000000 -0.252719
Complex Weather -0.034540 0.101000 -0.029067 0.145497 -0.115573 -0.114990 0.129305 -0.011538 -0.222900 -0.215771 -0.252719 1.000000
In [55]:
PorcentajeRegistrados=RentalBikesDF['Registered'].sum()/RentalBikesDF['cnt'].sum()
PorcentajeCasual=RentalBikesDF['Casual'].sum()/RentalBikesDF['cnt'].sum()
PorcentajeRegistrados*100,PorcentajeCasual*100
Out[55]:
(81.15113686239788, 18.848863137602127)
In [56]:
# Con esto evidenciamos que el 81% de los biciusuarios es un usuario registrado. El restante 19% es usuario casual
In [57]:
# Lo primero que podemos observar es una alta correlacion entre el total de usuarios y los usuarios registrados (0.945517).
# Esto nos da a entender que, en la medida que suban los usuarios registrados, mi cantidad de usuarios va a subir en una 
# similar. Por eso vamos a enfocar nuestro analisis en el comportamiento de los usuarios registrados. 
In [58]:
reduceRentalBikeDF['cnt'].describe(percentiles = [.25, .5, .75, .95, .99])
Out[58]:
count     730.000000
mean     4503.683562
std      1938.456109
min        22.000000
25%      3146.500000
50%      4544.500000
75%      5966.000000
95%      7576.400000
99%      8163.810000
max      8714.000000
Name: cnt, dtype: float64
In [59]:
plt.figure(figsize = (28, 3))
plt.boxplot(reduceRentalBikeDF['cnt'], vert = False)
plt.show()
In [60]:
plt.figure(figsize = (28, 3))
plt.boxplot(reduceRentalBikeDF['Registered'], vert = False)
plt.show()
In [61]:
# Podemos ver que el comportamiento entre cnt y register es el mismo. Esto refuerza nuestra teoria de explorar que infuye en el
# uso de los usuarios registrados de las bicicletas.
In [62]:
# Cantidad de usuarios por clima

df_cntByWeather= RentalBikesDF[['Registered','Weather']].groupby(['Weather']).sum('Registered')
df_cntByWeather=df_cntByWeather.sort_values('Registered', ascending=True).reset_index()
df_cntByWeather.plot('Weather','Registered', kind = 'bar')
Out[62]:
<AxesSubplot:xlabel='Weather'>
In [63]:
# Con la gráfica nos permite intuir que los usuarios registrados salen no salen cuando nieba o niebla. 
# Salen poco cuando hay niebla y de resto salen. El hecho de que salgan mas cuando puede esta parcialmente nublado puede deberse
# a que la mayor parte del año esta nublado. 
In [64]:
# Cantidad de usuarios por Temporada

df_cntBySeason= RentalBikesDF[['Registered','Season']].groupby(['Season']).sum('Registered')
df_cntBySeason=df_cntBySeason.sort_values('Registered', ascending=True).reset_index()
df_cntBySeason.plot('Season','Registered', kind = 'bar')
Out[64]:
<AxesSubplot:xlabel='Season'>
In [65]:
# Lo primero que podemos observar, visualmente, es que la cantidad de usuarios registrados en otoño es casi el doble que en primavera. Y
# mas curioso, es que el invierno y verano tienen una cantidad de usuarios registrados muy similar: Esto no deja de ser 
# curioso debido a que el indice de correlación entre el total de usuarios registrados vs la temperatura y sensación termica
# es del 0.54 aproximadanente, y las dos estaciones (Verano e Invierno) tienen climas radicalmente opuestos. 

# En este punto podemos empezar a concluir, a priori, que los usuarios no les importa la temporada del año para hacer uso de 
# la bicicleta sino las condiciones climaticas. Ahora revisemos los usuarios registrados vs Dia  
In [66]:
# Dia contra el total de usuarios

plt.figure(figsize = (13, 7))
plt.scatter(reduceRentalBikeDF['Date'], reduceRentalBikeDF['Registered'], alpha = 0.2, color = 'black')
plt.title('Registered Users by Day')
plt.xlabel('Date')
plt.ylabel('Registered Users')
plt.show()
In [67]:
# Temperatura comparado contra el total de usuarios
plt.figure(figsize = (13, 7))
plt.scatter(reduceRentalBikeDF['Registered'], reduceRentalBikeDF['Temp'], alpha = 0.2, color = 'black')
plt.title('Temperature vs. Registered')
plt.xlabel('Registered Users')
plt.ylabel('Temperature')
plt.show()
In [68]:
# Sensación Termica comparado contra el total de usuarios
plt.figure(figsize = (13, 7))
plt.scatter(reduceRentalBikeDF['Registered'],reduceRentalBikeDF['Feel_Temp'], alpha = 0.2, color = 'blue')
plt.title('Feel Temperature vs. Registered Users')
plt.xlabel('Registered Users')
plt.ylabel('Feel Temp')
plt.show()
In [69]:
# Cantidad de usuarios teniendo en cuenta si hay clima complejo o no

df_cntByClxWeather= RentalBikesDF[['Complex Weather','Registered']].groupby(['Complex Weather']).sum('Registered')
df_cntByClxWeather=df_cntByClxWeather.sort_values('Registered', ascending=True).reset_index()
df_cntByClxWeather.plot('Complex Weather','Registered', kind = 'bar')
Out[69]:
<AxesSubplot:xlabel='Complex Weather'>
In [70]:
# Por ultimo, respondiendo la inquietud sobre el comportamiento de los usuarios durante la semana.

df_cntByDay= RentalBikesDF[['Day','cnt']].groupby(['Day']).sum('cnt')
df_cntByDay=df_cntByDay.sort_values('Day', ascending=True).reset_index()
df_cntByDay.plot('Day','cnt', kind = 'bar')
Out[70]:
<AxesSubplot:xlabel='Day'>
In [71]:
# Evidenciamos un comportamiento homogeneo en el uso de la bicicleta los 7 días de la semana.
In [72]:
df_cntByDay= RentalBikesDF[['Day','Registered']].groupby(['Day']).sum('Registered')
df_cntByDay=df_cntByDay.sort_values('Day', ascending=True).reset_index()
df_cntByDay.plot('Day','Registered', kind = 'bar')
Out[72]:
<AxesSubplot:xlabel='Day'>
In [73]:
# Sucede algo similar con los usuarios registrados, aunque con una leve disminución los días miercoles
In [74]:
profile = ProfileReport(reduceRentalBikeDF)
profile.to_notebook_iframe()
Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]
Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]
Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]
In [75]:
# DESARROLLO DE MODELOS DE REGRESION (30 pts)
# Implemente al menos 3 modelos basados en el algoritmo de regresión lineal: uno simple,
# uno polinomial y uno con algún tipo de regularización.
In [76]:
# REGRESION LINEAL. Si bien es cierto, la variable objetivo es cnt, la voy a cambiar por Registered dado su alta correlación
# entre una y otra. Casual y cnt las voy a quitar.

lin_reg = LinearRegression()
x = reduceRentalBikeDF=RentalBikesDF[['Day','Season encoded','Holiday encoded','Weather encoded','Temp','Feel_Temp','Hum','Wind','Complex Weather']]
y = reduceRentalBikeDF=RentalBikesDF[['Registered']]

# Creando el set d epruebas y el set de entrenamiento
X_train, X_test, Y_train, Y_test = train_test_split(x, y, test_size = 0.1, random_state = 1)
In [77]:
# Escalando caracteristicas
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
regr = LinearRegression()
regr.fit(X_train_scaled, Y_train)
Out[77]:
LinearRegression()
In [78]:
regr.coef_
Out[78]:
array([[   9.62451442,  412.70208952, -147.7406142 , -106.53276404,
        1078.09233346, -138.4472879 ,  -37.32072523, -145.76660883,
        -221.16056103]])
In [79]:
regr.intercept_
Out[79]:
array([3665.60730594])
In [80]:
preds_train = regr.predict(X_train_scaled)
preds_test = regr.predict(X_test_scaled)
In [81]:
MAE_LinealRegretion=mean_absolute_error(Y_train, preds_train), mean_absolute_error(Y_test, preds_test)
MAE_LinealRegretion
Out[81]:
(1027.2189842592843, 936.009932189698)
In [82]:
RMSE_LinealRegretion=np.sqrt(mean_squared_error(Y_train, preds_train)), np.sqrt(mean_squared_error(Y_test, preds_test))
RMSE_LinealRegretion
Out[82]:
(1216.9322318690988, 1142.1169879336437)
In [83]:
plt.figure(figsize = (28, 3))
plt.boxplot((Y_test - preds_test), vert = False)
plt.grid()
plt.show()
In [84]:
# REGRESION POLINOMIAL RIDGE. 

# Definiendo un nuevo el set de pruebas y el set de entrenamiento. Solo para poder "jugar" con el grado, alpha y tamaño del 
# set de forma independiente a los demás modelos.

x = reduceRentalBikeDF=RentalBikesDF[['Day','Season encoded','Holiday encoded','Weather encoded','Temp','Feel_Temp','Hum','Wind','Complex Weather']]
y = reduceRentalBikeDF=RentalBikesDF[['Registered']]

X_train, X_test, Y_train, Y_test = train_test_split(x, y, test_size = 0.40, random_state = 1)

poly_features = PolynomialFeatures(degree=3,include_bias=False)
x_poly = poly_features.fit_transform(X_train)
ridge_df = Ridge(alpha=1)
ridge_df.fit(x_poly,Y_train)
df_poly_training=ridge_df.predict(x_poly)
x_poly_test = poly_features.fit_transform(X_test)
ridge_df_test = Ridge(alpha=60)
ridge_df_test.fit(x_poly_test,Y_test)
df_poly_test=ridge_df_test.predict(x_poly_test)
In [85]:
MAE_RidgeRegretion=mean_absolute_error(Y_train, df_poly_training), mean_absolute_error(Y_test, df_poly_test)
MAE_RidgeRegretion
Out[85]:
(693.5300766119352, 678.1109221226349)
In [86]:
RMSE_RidgeRegretion=np.sqrt(mean_squared_error(Y_train, df_poly_training)), np.sqrt(mean_squared_error(Y_test, df_poly_test))
RMSE_RidgeRegretion
Out[86]:
(858.5475611776061, 847.8555504017984)
In [87]:
# REGRESION POLINOMIAL LASSO. 
In [88]:
#Para lasso, debemos reducir el tamaño de las variables, por lo que incluiremos solamente las que consideramos primordiales

x = reduceRentalBikeDF=RentalBikesDF[['Weather encoded','Temp','Feel_Temp','Complex Weather']]
y = reduceRentalBikeDF=RentalBikesDF[['Registered']]

# Definiendo un nuevo el set de pruebas y el set de entrenamiento. Solo para poder "jugar" con el grado, alpha y tamaño del 
# set de forma independiente a los demás modelos.

X_train, X_test, Y_train, Y_test = train_test_split(x, y, test_size = 0.25, random_state = 1)

lasso_features = PolynomialFeatures(degree=20,include_bias=False)
x_lasso = lasso_features.fit_transform(X_train)

lasso_df = Lasso(alpha=3)
lasso_df.fit(x_lasso,Y_train)
df_lasso_training=lasso_df.predict(x_lasso)
x_lasso_test = lasso_features.fit_transform(X_test)
lasso_df_test = Lasso(alpha=3)
lasso_df_test.fit(x_lasso_test,Y_test)
Out[88]:
Lasso(alpha=3)
In [89]:
df_lasso_test=lasso_df_test.predict(x_lasso_test)
MAE_Laso=mean_absolute_error(Y_train, df_lasso_training), mean_absolute_error(Y_test, df_lasso_test)
MAE_Laso
Out[89]:
(902.951383186066, 849.2454434237974)
In [90]:
RMSE_Lasso=np.sqrt(mean_squared_error(Y_train, df_lasso_training)), np.sqrt(mean_squared_error(Y_test, df_lasso_test))
RMSE_Lasso
Out[90]:
(1108.728861880623, 1064.1205770467388)
In [91]:
# EVALUACION DE MODELOS (20 pts)
# Con base en el desempeño de cada uno de los modelos, concluya cuál es el modelo que se
# le debe presentar a la alcaldía. Justifique su respuesta.
In [92]:
MAE_LinealRegretion, MAE_RidgeRegretion, MAE_Laso
Out[92]:
((1027.2189842592843, 936.009932189698),
 (693.5300766119352, 678.1109221226349),
 (902.951383186066, 849.2454434237974))
In [93]:
RMSE_LinealRegretion, RMSE_RidgeRegretion, RMSE_Lasso
Out[93]:
((1216.9322318690988, 1142.1169879336437),
 (858.5475611776061, 847.8555504017984),
 (1108.728861880623, 1064.1205770467388))
In [94]:
# Al comparar la error medio absoluto entre los tres modelos, tengo un valor menos alto al utilizar la regresión polinomial ridge 
# y una diferecia no tan alta entre el MEAN de entrenamiento y el MEAN de pruebas, lo que me permite evidenciar que no tiene problemas
# de sobreentrenamiento y/o subentrenamiento. 
# En el caso del modelo LASSO, me estoy enfocando en unas pocas variables y para que la media del valor absoluto baje debo 
# aumentar bastante el nivel del grad y a pesar que lo lleve a grado 20 no baja el MEAN a niveles razonables
# Dado lo anterior, considero que se le debe presentar a la alcaldía el modelo de regresión Ridge.
In [95]:
# INTERPRETACION (10 pts)
# El día de la presentación de resultados, una persona de la alcaldía le hace las siguientes preguntas:
# - ¿Cuáles son las 3 variables más importantes para la predicción de la cantidad de usuarios?
In [96]:
RentalBikesDF[['Weather encoded','Temp','Feel_Temp','Casual','Registered','cnt','Complex Weather']].corr()
Out[96]:
Weather encoded Temp Feel_Temp Casual Registered cnt Complex Weather
Weather encoded 1.000000 -0.041807 -0.051932 -0.081051 -0.110812 -0.117945 0.145497
Temp -0.041807 1.000000 0.991637 0.543933 0.540502 0.627944 -0.115573
Feel_Temp -0.051932 0.991637 1.000000 0.544087 0.544342 0.631091 -0.114990
Casual -0.081051 0.543933 0.544087 1.000000 0.396230 0.673364 -0.222900
Registered -0.110812 0.540502 0.544342 0.396230 1.000000 0.945606 -0.215771
cnt -0.117945 0.627944 0.631091 0.673364 0.945606 1.000000 -0.252719
Complex Weather 0.145497 -0.115573 -0.114990 -0.222900 -0.215771 -0.252719 1.000000
In [97]:
# RTA: La primera, es la cantidad de usuarios registrados-casuales (Es una variable con altacorrelación). La segunda es
# la sensación termica (FeelTemp) y la temperatura (temp), por ultimo tenemos el estado del clima. (Si llueve, Nieva o hace 
# neblina la gente no sale.
In [98]:
# - Describa cual es el escenario ideal para el incremento de usuarios.
# RTA: En los primero analisis des este informe se puede evidenciar que hay dos variables que inciden fuertemente: Temperatura que esta
# relacionada con la sensación termica y que no haga mal clima (puede estar nublado desde que no llueva, nieve o haga neblina).
# Es decir, Día sin mal clima y entre mayor temperatura mejor.
In [99]:
# - ¿Qué pasos adicionales deberían tener en cuenta para una próxima iteración/mejora del modelo?
In [ ]:
# RTA: Agregar mayor cantidad de variables (no solo clima y temperatura). Por ejemplo, estrato, si la persona tiene vehiculo,
# distancia que se desplaza, zona donde vive, razón por la que se desplaza, entre otros. Enriquecer el modelo